Global helper functions (convert the `key` factor into dummy variables).
source("thesis_functions.R")
# Convert the `key` column of a data frame into eleven 0/1 dummy columns
# (key1..key11; key 0 is the implicit baseline) and drop the original column.
#
# Args:
#   x: data frame with a `key` column (factor with levels 0:11 here).
# Returns: `x` with numeric columns key1..key11 appended and `key` removed.
key.dummy <- function(x){
  # Base-R rewrite of the original eleven hand-written ifelse() calls. The
  # original also referenced `x$key` inside mutate(), bypassing dplyr's data
  # masking; a plain loop is both shorter and dependency-free. Comparison
  # semantics (`key == k` on a factor) are unchanged.
  for (k in 1:11) {
    x[[paste0("key", k)]] <- ifelse(x$key == k, 1, 0)
  }
  x$key <- NULL
  x
}
#train/valid/test function from data mining class:
# Randomly partition a data frame into train / validation / test sets.
# (From a data mining class.)
#
# Args:
#   data: data frame to split.
#   prop.train: proportion of rows for the training set.
#   prop.val: proportion of rows for the validation set.
# Returns: list(data.train, data.test, data.val); the test set receives
#   all remaining rows. Uses the current RNG state (set.seed beforehand
#   for reproducibility).
partition.3 <- function(data, prop.train, prop.val){
  n <- nrow(data)
  all.rows <- seq_len(n)  # seq_len() is safe when n == 0, unlike 1:n
  # select a random sample of size = prop.train % of total records
  selected1 <- sample(all.rows, round(n * prop.train), replace = FALSE)
  data.train <- data[selected1, ]
  # draw the validation sample from the rows not used for training
  rest <- setdiff(all.rows, selected1)
  selected2 <- sample(rest, round(n * prop.val), replace = FALSE)
  data.val <- data[selected2, ]
  # testing data gets whatever is left over
  data.test <- data[setdiff(rest, selected2), ]
  list(data.train = data.train, data.test = data.test, data.val = data.val)
}
#creates plot for optimal cut off value
# Sweep classification cutoffs 0.00-1.00 for a fitted glm and record
# sensitivity, specificity, |sensitivity - specificity|, and kappa at each.
#
# Args:
#   model: fitted binomial glm; predict(..., type = "response") must yield
#          class-1 probabilities.
#   data: data frame with the model's predictors and a `popular` factor.
# Returns: list(cutoff, sensitivity.vec, specificity.vec, ssdiff.vec, kappa.vec).
opt.cut.func <- function(model, data){
  # create a vector for cutoff values
  cutoff <- seq(0, 1, 0.01)
  # preallocate result vectors of the same length
  sensitivity.vec <- rep(NA, length(cutoff))
  specificity.vec <- rep(NA, length(cutoff))
  ssdiff.vec <- rep(NA, length(cutoff))
  kappa.vec <- rep(NA, length(cutoff))
  # Predict once, before the loop: the probabilities do not depend on the
  # cutoff, so the original per-iteration predict() call was redundant work
  # (101 identical predictions).
  pred.prob.val <- predict(model, newdata = data, type = "response")
  for(i in seq_along(cutoff)){
    pred.y.val <- as.factor(ifelse(pred.prob.val > cutoff[i], 1, 0))
    # warning messages galore because the probability of a value actually being popular is SO LOW
    cm <- confusionMatrix(pred.y.val, data$popular,
                          positive = "1")
    sensitivity.vec[i] <- cm$byClass["Sensitivity"]
    specificity.vec[i] <- cm$byClass["Specificity"]
    ssdiff.vec[i] <- abs(sensitivity.vec[i] - specificity.vec[i])
    kappa.vec[i] <- cm$overall["Kappa"]
  }
  return(list(cutoff = cutoff, sensitivity.vec = sensitivity.vec, specificity.vec = specificity.vec, ssdiff.vec = ssdiff.vec, kappa.vec = kappa.vec))
}
# Cutoff sweep for caret-trained (regularized) models whose predict()
# returns a two-column probability matrix; otherwise identical in spirit
# to opt.cut.func().
#
# Args:
#   model: caret `train` object supporting predict(..., type = "prob").
#   data: data frame with the model's predictors and a `popular` factor.
# Returns: list(cutoff, sensitivity.vec, specificity.vec, ssdiff.vec, kappa.vec).
reg.opt.cut.func <- function(model, data){
  cutoff <- seq(0, 1, 0.01)
  # preallocate result vectors of the same length
  sensitivity.vec <- rep(NA, length(cutoff))
  specificity.vec <- rep(NA, length(cutoff))
  ssdiff.vec <- rep(NA, length(cutoff))
  kappa.vec <- rep(NA, length(cutoff))
  # Predict once, before the loop: the probabilities do not depend on the
  # cutoff, so the original re-predicted on every iteration for no reason.
  pred.prob.val <- predict(model, s=model$bestTune, data, type = "prob")
  for(i in seq_along(cutoff)){
    # column 2 holds the probability of the positive ("1") class
    pred.y.val <- as.factor(ifelse(pred.prob.val[,2] > cutoff[i], 1, 0))
    # warning messages galore because the probability of a value actually being popular is SO LOW
    cm <- confusionMatrix(pred.y.val, data$popular,
                          positive = "1")
    sensitivity.vec[i] <- cm$byClass["Sensitivity"]
    specificity.vec[i] <- cm$byClass["Specificity"]
    ssdiff.vec[i] <- abs(sensitivity.vec[i] - specificity.vec[i])
    kappa.vec[i] <- cm$overall["Kappa"]
  }
  return(list(cutoff = cutoff, sensitivity.vec = sensitivity.vec, specificity.vec = specificity.vec, ssdiff.vec = ssdiff.vec, kappa.vec = kappa.vec))
}
# Plot sensitivity, specificity, and kappa against the candidate cutoffs
# produced by opt.cut.func() / reg.opt.cut.func().
#
# Args:
#   opt.out: list with cutoff, sensitivity.vec, specificity.vec, kappa.vec.
opt.cut.plot <- function(opt.out){
  curve.cols <- c(Sensitivity = "blue", Specificity = "green", Kappa = "red")
  plot(opt.out$cutoff, opt.out$sensitivity.vec, xlab = "cutoff",
       type = "l", col = curve.cols[["Sensitivity"]])
  lines(opt.out$cutoff, opt.out$specificity.vec, type = "l",
        col = curve.cols[["Specificity"]])
  lines(opt.out$cutoff, opt.out$kappa.vec, type = "l",
        col = curve.cols[["Kappa"]])
  legend(x = "right", legend = c("Sensitivity", "Specificity", "Kappa"),
         col = c("blue", "green", "red"), lty = 1, lwd = 1)
}
Load and prepare the data.
# Read the raw k-pop track data.
data <- fread("kpopdata.csv")
# Coerce column types: categorical columns -> factors, parse release_date,
# and derive the binary label `popular` (1 iff popularity >= 50).
data <- mutate(data, ArtistType = as.factor(ArtistType),
ArtistGender = as.factor(ArtistGender),
ArtistGen = factor(ArtistGen),
release_date = as.POSIXct(release_date, format = "%m/%d/%y"),
key = factor(key, levels = 0:11),
mode = as.factor(mode),
time_signature = factor(time_signature, levels = c(1,3,4,5)),
popular = factor(ifelse(popularity >=50, 1, 0)))
Understanding the distribution of the popularity score.
# Inspect the raw popularity distribution.
hist(data$popularity)
summary(data$popularity)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.00 11.00 23.00 25.88 39.00 94.00
heavily skewed right.
The square root transformation makes it more normal. This will help to meet the multiple linear regression assumptions.
# Distribution after the square-root transformation.
hist(data$popularity^0.5)
Classification target: the share of popular vs. non-popular songs.
# Class balance for the binary `popular` label (~12% positive).
table(data$popular)/nrow(data)
##
## 0 1
## 0.8812021 0.1187979
if classifying a song as popular when its score is greater than 50, only ~12% of the data is considered a popular song.
# NOTE(review): this selection (which includes time_signature) is immediately
# overwritten below by a narrower selection without it — confirm which was intended.
kpop <- dplyr::select(data, popularity, duration, acousticness, danceability, energy, instrumentalness, key, loudness, mode, speechiness, time_signature, tempo, valence)
General assumptions: continuous response (popularity score ranging from 0 - 100); a mix of categorical and continuous predictors. The distributions of the variables are not normal; we will check for normality of error terms where appropriate.
Goal: create model for predicting popularity scores
select just audio features
# Keep only the audio features, then split by musical mode (0 = minor, 1 = major).
kpop <- select(data, popularity, duration, acousticness, danceability, energy, instrumentalness, key, loudness, mode, speechiness, tempo, valence)
kpop0 <- kpop %>% filter(mode == 0)%>% select(-mode)
kpop1 <- kpop %>% filter(mode == 1) %>% select(-mode)
### Kpop mode 0 train and test
# 75/25 train/test split, seeded for reproducibility.
smpl.size0 <- floor(0.75*nrow(kpop0))
set.seed(123)
smpl0 <- sample(nrow(kpop0), smpl.size0, replace = FALSE)
train0 <- kpop0[smpl0,]
test0 <- kpop0[-smpl0,]
### Kpop mode 1 train and test
smpl.size1 <- floor(0.75*nrow(kpop1))
set.seed(123)
smpl1 <- sample(nrow(kpop1), smpl.size1, replace = FALSE)
train1 <- kpop1[smpl1,]
test1 <- kpop1[-smpl1,]
fit a multiple linear regression model
# Baseline multiple linear regression on the raw popularity score (mode-0 songs).
ml0 <- lm(popularity ~. , data = train0)
summary(ml0)
##
## Call:
## lm(formula = popularity ~ ., data = train0)
##
## Residuals:
## Min 1Q Median 3Q Max
## -38.283 -13.161 -1.606 11.489 64.381
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 98.43022 4.99248 19.716 < 2e-16 ***
## duration -5.41104 0.55905 -9.679 < 2e-16 ***
## acousticness -3.81930 2.05303 -1.860 0.062924 .
## danceability -2.19279 3.28543 -0.667 0.504544
## energy -33.97108 3.33505 -10.186 < 2e-16 ***
## instrumentalness -15.67494 4.37625 -3.582 0.000346 ***
## key1 -1.20599 1.49947 -0.804 0.421291
## key2 -2.72991 1.85598 -1.471 0.141416
## key3 3.04084 2.07581 1.465 0.143040
## key4 -4.34609 1.46983 -2.957 0.003129 **
## key5 -0.56490 1.42695 -0.396 0.692217
## key6 1.34681 1.48984 0.904 0.366062
## key7 0.04123 1.63277 0.025 0.979856
## key8 -1.57381 1.76361 -0.892 0.372251
## key9 -0.94547 1.51224 -0.625 0.531872
## key10 -1.75720 1.55635 -1.129 0.258955
## key11 -2.41842 1.38950 -1.740 0.081861 .
## loudness 3.55623 0.20271 17.543 < 2e-16 ***
## speechiness 23.77941 4.74755 5.009 5.75e-07 ***
## tempo -0.00429 0.01255 -0.342 0.732398
## valence -11.88489 1.78863 -6.645 3.51e-11 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 17.34 on 3483 degrees of freedom
## Multiple R-squared: 0.143, Adjusted R-squared: 0.138
## F-statistic: 29.05 on 20 and 3483 DF, p-value: < 2.2e-16
# Standard lm diagnostic plots (residuals vs fitted, QQ, scale-location, leverage).
plot(ml0)
no or little multicollinearity
no autocorrelation
no homoscedasticity.
Try again with a transformation to make popularity more normal (square root).
# Refit the mode-0 regression with a sqrt-transformed response to reduce skew.
ml0.sqrt <- lm(popularity ~. , data = train0 %>% mutate(popularity = popularity^0.5))
summary(ml0.sqrt)
##
## Call:
## lm(formula = popularity ~ ., data = train0 %>% mutate(popularity = popularity^0.5))
##
## Residuals:
## Min 1Q Median 3Q Max
## -5.9651 -1.2163 0.1704 1.3715 5.2110
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 12.8026945 0.5397564 23.719 < 2e-16 ***
## duration -0.5857021 0.0604408 -9.691 < 2e-16 ***
## acousticness -0.1418036 0.2219612 -0.639 0.522952
## danceability -0.1381154 0.3552010 -0.389 0.697420
## energy -3.9085950 0.3605651 -10.840 < 2e-16 ***
## instrumentalness -2.2443174 0.4731338 -4.744 2.18e-06 ***
## key1 -0.1306867 0.1621132 -0.806 0.420214
## key2 -0.3754834 0.2006575 -1.871 0.061392 .
## key3 0.1823478 0.2244238 0.813 0.416551
## key4 -0.5438271 0.1589089 -3.422 0.000628 ***
## key5 -0.1190871 0.1542733 -0.772 0.440212
## key6 0.1383520 0.1610728 0.859 0.390433
## key7 -0.0235720 0.1765251 -0.134 0.893779
## key8 -0.1566102 0.1906708 -0.821 0.411495
## key9 -0.1520276 0.1634941 -0.930 0.352505
## key10 -0.2351475 0.1682632 -1.397 0.162353
## key11 -0.2909129 0.1502246 -1.937 0.052885 .
## loudness 0.4225379 0.0219162 19.280 < 2e-16 ***
## speechiness 2.1574574 0.5132766 4.203 2.70e-05 ***
## tempo -0.0007701 0.0013563 -0.568 0.570207
## valence -1.1893401 0.1933758 -6.150 8.60e-10 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.874 on 3483 degrees of freedom
## Multiple R-squared: 0.1594, Adjusted R-squared: 0.1546
## F-statistic: 33.03 on 20 and 3483 DF, p-value: < 2.2e-16
assumption checking and diagnostics
plot(ml0.sqrt)
prediction on test data
# prediction on test data
yhat.mlr = predict(ml0.sqrt, newdata = test0 %>% mutate(popularity = popularity^0.5))
# RMSE for test data (on the sqrt scale, so comparable across these models)
error.mlr <- yhat.mlr - test0$popularity^0.5
rmse.mlr <- sqrt(mean(error.mlr^2))
data.frame(
RMSE = RMSE(yhat.mlr, test0$popularity^0.5),
R2 = R2(yhat.mlr, test0$popularity^0.5)
)
## RMSE R2
## 1 1.863478 0.1332768
Much improved!
set.seed(123)
train_control <- trainControl(method = "cv",
number = 10)
# Stepwise AIC selection under 10-fold CV; capture.output() swallows the
# very verbose per-step log printed by lmStepAIC.
bye <- capture.output(mlr.step_kcv <- train(popularity ~. , data = train0 %>% mutate(popularity = popularity^0.5),
method = "lmStepAIC", trControl = train_control))
print(mlr.step_kcv)
## Linear Regression with Stepwise Selection
##
## 3504 samples
## 10 predictor
##
## No pre-processing
## Resampling: Cross-Validated (10 fold)
## Summary of sample sizes: 3153, 3152, 3154, 3154, 3154, 3154, ...
## Resampling results:
##
## RMSE Rsquared MAE
## 1.878512 0.1528544 1.516643
mlr.step_kcv$finalModel
##
## Call:
## lm(formula = .outcome ~ duration + energy + instrumentalness +
## key2 + key3 + key4 + key6 + key11 + loudness + speechiness +
## valence, data = dat)
##
## Coefficients:
## (Intercept) duration energy instrumentalness
## 12.4040 -0.5830 -3.8007 -2.2441
## key2 key3 key4 key6
## -0.2588 0.3053 -0.4241 0.2617
## key11 loudness speechiness valence
## -0.1689 0.4214 2.1000 -1.2328
prediction on test data
# prediction on test data
# key.dummy() is needed because the stepwise final model was fit on
# individually-dummied key columns rather than the key factor.
yhat.step_kcv = predict(mlr.step_kcv$finalModel, newdata=key.dummy(test0) %>% mutate(popularity = popularity^0.5))
# RMSE for test data
error.step_kcv <- yhat.step_kcv - test0$popularity^0.5
rmse.step_kcv <- sqrt(mean(error.step_kcv^2))
data.frame(
RMSE = RMSE(yhat.step_kcv, test0$popularity^0.5),
R2 = R2(yhat.step_kcv, test0$popularity^0.5)
)
## RMSE R2
## 1 1.861179 0.1350903
Both of the models have RMSE scores of around 2. This isn't terrible since the range of the transformed popularity scores is 0-10.
# Ridge regression (alpha = 0) on the sqrt-transformed response, mode-0 data;
# lambda tuned over a grid by 10-fold CV.
lm.ridge0 <- train(popularity ~. , data = train0 %>% mutate(popularity = popularity^0.5), method = "glmnet",
trControl = train_control,
tuneGrid = expand.grid(alpha = 0,lambda = seq(0.001,0.1,by = 0.001)))
#Best Tuning Parameter
lm.ridge0$bestTune
## alpha lambda
## 47 0 0.047
# best coefficient
lm.ridge0.model <- coef(lm.ridge0$finalModel, lm.ridge0$bestTune$lambda)
lm.ridge0.model
## 21 x 1 sparse Matrix of class "dgCMatrix"
## 1
## (Intercept) 12.3411323068
## duration -0.5762465119
## acousticness -0.0662308488
## danceability -0.1493986374
## energy -3.5601398635
## instrumentalness -2.2691735701
## key1 -0.0831090967
## key2 -0.3241117452
## key3 0.2260133231
## key4 -0.4931505942
## key5 -0.0698858769
## key6 0.1837187046
## key7 0.0173666066
## key8 -0.1059775161
## key9 -0.1084064701
## key10 -0.1844827951
## key11 -0.2447391761
## loudness 0.3990852347
## speechiness 2.0404769494
## tempo -0.0007694543
## valence -1.1748730700
# prediction on test data
yhat.ridge0 = predict(lm.ridge0, s=lm.ridge0$bestTune,newdata=test0%>% mutate(popularity = popularity^0.5))
# RMSE for test data
data.frame(
RMSE = RMSE(yhat.ridge0, test0$popularity^0.5),
R2 = R2(yhat.ridge0, test0$popularity^0.5)
)
## RMSE R2
## 1 1.861093 0.1339601
# Lasso (alpha = 1) on the same data; note the sparse coefficients ('.') —
# lasso drops several predictors entirely.
lm.lasso0 <- train(popularity ~. , data = train0 %>% mutate(popularity = popularity^0.5), method = "glmnet",
trControl = train_control,
tuneGrid = expand.grid(alpha = 1,lambda = seq(0.001,0.1,by = 0.001)))
#Best Tuning Parameter
lm.lasso0$bestTune
## alpha lambda
## 17 1 0.017
# best coefficient
lm.lasso0.model <- coef(lm.lasso0$finalModel, lm.lasso0$bestTune$lambda)
lm.lasso0.model
## 21 x 1 sparse Matrix of class "dgCMatrix"
## 1
## (Intercept) 11.92608192
## duration -0.55317885
## acousticness .
## danceability .
## energy -3.50918126
## instrumentalness -2.05824555
## key1 .
## key2 -0.16147902
## key3 0.21698994
## key4 -0.36337170
## key5 .
## key6 0.22354434
## key7 0.03593591
## key8 .
## key9 .
## key10 -0.04408683
## key11 -0.12086307
## loudness 0.39934685
## speechiness 1.77557012
## tempo .
## valence -1.14005299
# prediction on test data
yhat.lasso0 = predict(lm.lasso0, s=lm.lasso0$bestTune,newdata=test0%>% mutate(popularity = popularity^0.5))
# RMSE for test data
error.lasso0 <- yhat.lasso0 - test0$popularity^0.5
rmse.lasso0 <- sqrt(mean(error.lasso0^2))
data.frame(
RMSE = RMSE(yhat.lasso0, test0$popularity^0.5),
R2 = R2(yhat.lasso0, test0$popularity^0.5)
)
## RMSE R2
## 1 1.858953 0.1351553
# Elastic net over a 2-D (alpha, lambda) grid on the sqrt-transformed
# popularity score, mode-0 data.
# BUG FIX: the original passed family = "binomial" — a classification
# family — but the response here is continuous (popularity^0.5). The family
# argument is removed so caret/glmnet fits a gaussian regression, matching
# the ridge and lasso fits above.
lm.enet0 <- train(popularity ~. , data = train0 %>% mutate(popularity = popularity^0.5), method = "glmnet",
trControl = train_control,
tuneGrid = expand.grid(alpha = seq(0,1,by = 0.05),lambda = seq(0.001,0.1,by = 0.001)))
# best parameter
lm.enet0$bestTune
# best coefficient
lm.enet0.model <- coef(lm.enet0$finalModel, lm.enet0$bestTune$lambda)
lm.enet0.model
# prediction on test data
yhat.enet0 = predict(lm.enet0, s=lm.enet0$bestTune,newdata=test0%>% mutate(popularity = popularity^0.5))
# RMSE for test data
error.enet0 <- yhat.enet0 - test0$popularity^0.5
rmse.enet0 <- sqrt(mean(error.enet0^2))
data.frame(
RMSE = RMSE(yhat.enet0, test0$popularity^0.5),
R2 = R2(yhat.enet0, test0$popularity^0.5)
)
# Same sqrt-response regression for the mode-1 (major key) songs.
ml1.sqrt <- lm(popularity ~. , data = train1 %>% mutate(popularity = popularity^0.5))
summary(ml1.sqrt)
##
## Call:
## lm(formula = popularity ~ ., data = train1 %>% mutate(popularity = popularity^0.5))
##
## Residuals:
## Min 1Q Median 3Q Max
## -7.4223 -1.2067 0.1313 1.3255 5.2394
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 10.857177 0.401243 27.059 < 2e-16 ***
## duration -0.554933 0.044683 -12.419 < 2e-16 ***
## acousticness 0.080560 0.148271 0.543 0.586927
## danceability 0.050885 0.254514 0.200 0.841542
## energy -3.077791 0.280908 -10.957 < 2e-16 ***
## instrumentalness -1.877299 0.352964 -5.319 1.09e-07 ***
## key1 0.321294 0.091623 3.507 0.000457 ***
## key2 0.207166 0.098073 2.112 0.034699 *
## key3 0.430613 0.150398 2.863 0.004210 **
## key4 -0.029116 0.127272 -0.229 0.819058
## key5 0.204278 0.111709 1.829 0.067504 .
## key6 0.216946 0.109298 1.985 0.047206 *
## key7 0.283270 0.091145 3.108 0.001894 **
## key8 0.297098 0.105736 2.810 0.004975 **
## key9 0.270884 0.113464 2.387 0.017001 *
## key10 0.080066 0.132171 0.606 0.544685
## key11 0.389907 0.117678 3.313 0.000928 ***
## loudness 0.386256 0.017387 22.216 < 2e-16 ***
## speechiness 4.579814 0.414965 11.037 < 2e-16 ***
## tempo -0.001474 0.000958 -1.538 0.124055
## valence -0.849286 0.154553 -5.495 4.08e-08 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.821 on 5483 degrees of freedom
## Multiple R-squared: 0.1499, Adjusted R-squared: 0.1468
## F-statistic: 48.35 on 20 and 5483 DF, p-value: < 2.2e-16
prediction on test data
# prediction on test data
yhat.mlr = predict(ml1.sqrt, newdata = test1 %>% mutate(popularity = popularity^0.5))
# RMSE for test data
error.mlr <- yhat.mlr - test1$popularity^0.5
rmse.mlr <- sqrt(mean(error.mlr^2))
data.frame(
RMSE = RMSE(yhat.mlr, test1$popularity^0.5),
R2 = R2(yhat.mlr, test1$popularity^0.5)
)
## RMSE R2
## 1 1.858473 0.1257546
set.seed(123)
train_control <- trainControl(method = "cv",
number = 10)
# Stepwise AIC selection under 10-fold CV for the mode-1 data; capture.output()
# suppresses lmStepAIC's verbose per-step log.
bye <- capture.output(mlr.step_kcv <- train(popularity ~. , data = train1 %>% mutate(popularity = popularity^0.5),
method = "lmStepAIC", trControl = train_control))
print(mlr.step_kcv)
## Linear Regression with Stepwise Selection
##
## 5504 samples
## 10 predictor
##
## No pre-processing
## Resampling: Cross-Validated (10 fold)
## Summary of sample sizes: 4953, 4954, 4954, 4955, 4953, 4953, ...
## Resampling results:
##
## RMSE Rsquared MAE
## 1.823965 0.1452973 1.468792
mlr.step_kcv$finalModel
##
## Call:
## lm(formula = .outcome ~ duration + energy + instrumentalness +
## key1 + key2 + key3 + key5 + key6 + key7 + key8 + key9 + key11 +
## loudness + speechiness + tempo + valence, data = dat)
##
## Coefficients:
## (Intercept) duration energy instrumentalness
## 10.984803 -0.555451 -3.160519 -1.884462
## key1 key2 key3 key5
## 0.311835 0.198954 0.425083 0.196319
## key6 key7 key8 key9
## 0.209863 0.274552 0.288519 0.261604
## key11 loudness speechiness tempo
## 0.383154 0.387173 4.578753 -0.001535
## valence
## -0.836556
prediction on test data
# prediction on test data
# key.dummy() matches the dummied key columns the stepwise model was fit on.
yhat.step_kcv = predict(mlr.step_kcv$finalModel, newdata=key.dummy(test1) %>% mutate(popularity = popularity^0.5))
# RMSE for test data
error.step_kcv <- yhat.step_kcv - test1$popularity^0.5
rmse.step_kcv <- sqrt(mean(error.step_kcv^2))
data.frame(
RMSE = RMSE(yhat.step_kcv, test1$popularity^0.5),
R2 = R2(yhat.step_kcv, test1$popularity^0.5)
)
## RMSE R2
## 1 1.858518 0.1257011
# Ridge regression (alpha = 0) on the sqrt-transformed response, mode-1 data.
lm.ridge1 <- train(popularity ~. , data = train1 %>% mutate(popularity = popularity^0.5), method = "glmnet",
trControl = train_control,
tuneGrid = expand.grid(alpha = 0,lambda = seq(0.001,0.1,by = 0.001)))
#Best Tuning Parameter
lm.ridge1$bestTune
## alpha lambda
## 45 0 0.045
# best coefficient
lm.ridge1.model <- coef(lm.ridge1$finalModel, lm.ridge1$bestTune$lambda)
lm.ridge1.model
## 21 x 1 sparse Matrix of class "dgCMatrix"
## 1
## (Intercept) 10.347839763
## duration -0.543796248
## acousticness 0.169898922
## danceability 0.049633951
## energy -2.620990536
## instrumentalness -1.927744817
## key1 0.293618095
## key2 0.182685249
## key3 0.399431145
## key4 -0.049409156
## key5 0.179350934
## key6 0.188771865
## key7 0.259698734
## key8 0.273385099
## key9 0.249069582
## key10 0.051821098
## key11 0.361138772
## loudness 0.355806469
## speechiness 4.325244621
## tempo -0.001381351
## valence -0.858954968
# prediction on test data
yhat.ridge1 = predict(lm.ridge1, s=lm.ridge1$bestTune,newdata=test1%>% mutate(popularity = popularity^0.5))
# RMSE for test data
data.frame(
RMSE = RMSE(yhat.ridge1, test1$popularity^0.5),
R2 = R2(yhat.ridge1, test1$popularity^0.5)
)
## RMSE R2
## 1 1.858182 0.1253763
# Lasso (alpha = 1) on the mode-1 data; with the small optimal lambda (0.003)
# no coefficients are dropped here.
lm.lasso1 <- train(popularity ~. , data = train1 %>% mutate(popularity = popularity^0.5), method = "glmnet",
trControl = train_control,
tuneGrid = expand.grid(alpha = 1,lambda = seq(0.001,0.1,by = 0.001)))
#Best Tuning Parameter
lm.lasso1$bestTune
## alpha lambda
## 3 1 0.003
# best coefficient
lm.lasso1.model <- coef(lm.lasso1$finalModel, lm.lasso1$bestTune$lambda)
lm.lasso1.model
## 21 x 1 sparse Matrix of class "dgCMatrix"
## 1
## (Intercept) 10.823106069
## duration -0.549904268
## acousticness 0.078797866
## danceability 0.007432375
## energy -3.012459438
## instrumentalness -1.855427319
## key1 0.268514415
## key2 0.153367880
## key3 0.368124062
## key4 -0.060812240
## key5 0.148197878
## key6 0.160460653
## key7 0.231766196
## key8 0.242516531
## key9 0.215811156
## key10 0.020377237
## key11 0.334220735
## loudness 0.381054835
## speechiness 4.504126435
## tempo -0.001394241
## valence -0.829567095
# prediction on test data
yhat.lasso1 = predict(lm.lasso1, s=lm.lasso1$bestTune,newdata=test1%>% mutate(popularity = popularity^0.5))
# RMSE for test data
error.lasso1 <- yhat.lasso1 - test1$popularity^0.5
rmse.lasso1 <- sqrt(mean(error.lasso1^2))
data.frame(
RMSE = RMSE(yhat.lasso1, test1$popularity^0.5),
R2 = R2(yhat.lasso1, test1$popularity^0.5)
)
## RMSE R2
## 1 1.858393 0.1255584
# Elastic net over a 2-D (alpha, lambda) grid on the sqrt-transformed
# popularity score, mode-1 data.
# BUG FIX: the original passed family = "binomial" — a classification
# family — but the response here is continuous (popularity^0.5). The family
# argument is removed so caret/glmnet fits a gaussian regression, matching
# the ridge and lasso fits above.
lm.enet1 <- train(popularity ~. , data = train1 %>% mutate(popularity = popularity^0.5), method = "glmnet",
trControl = train_control,
tuneGrid = expand.grid(alpha = seq(0,1,by = 0.05),lambda = seq(0.001,0.1,by = 0.001)))
# best parameter
lm.enet1$bestTune
# best coefficient
lm.enet1.model <- coef(lm.enet1$finalModel, lm.enet1$bestTune$lambda)
lm.enet1.model
# prediction on test data
yhat.enet1 = predict(lm.enet1, s=lm.enet1$bestTune,newdata=test1%>% mutate(popularity = popularity^0.5))
# RMSE for test data
error.enet1 <- yhat.enet1 - test1$popularity^0.5
rmse.enet1 <- sqrt(mean(error.enet1^2))
data.frame(
RMSE = RMSE(yhat.enet1, test1$popularity^0.5),
R2 = R2(yhat.enet1, test1$popularity^0.5)
)
Use 70% train, 15% validation, 15% test, to use validation for finding optimal cutoff value.
# Build the classification dataset (binary `popular` response) and split each
# mode into 70% train / 15% validation / 15% test; the validation set is used
# below to choose the probability cutoff.
kpop.logit <- select(data, popular, duration, acousticness, danceability, energy, instrumentalness, key, loudness, mode, speechiness, tempo, valence)
logit.kpop0 <- kpop.logit %>% filter(mode == 0)%>% select(-mode)
logit.kpop1 <- kpop.logit %>% filter(mode == 1) %>% select(-mode)
### Kpop mode 0 train and test
# smpl.size0 <- floor(0.75*nrow(logit.kpop0))
# set.seed(123)
# smpl0 <- sample(nrow(logit.kpop0), smpl.size0, replace = FALSE)
# og.logit.train0 <- logit.kpop0[smpl0,]
# og.logit.test0 <- logit.kpop0[-smpl0,]
p3 <- partition.3(logit.kpop0, 0.70, 0.15)
logit.train0 <- p3$data.train
logit.valid0 <- p3$data.val
logit.test0 <- p3$data.test
# train + validation combined, for refitting the final model
all.train0 <- rbind(logit.train0, logit.valid0)
# ### Kpop mode 1 train and test
# smpl.size1 <- floor(0.75*nrow(logit.kpop1))
# set.seed(123)
# smpl1 <- sample(nrow(logit.kpop1), smpl.size1, replace = FALSE)
# logit.train1 <- logit.kpop1[smpl1,]
# logit.test1 <- logit.kpop1[-smpl1,]
p3 <- partition.3(logit.kpop1, 0.70, 0.15)
logit.train1 <- p3$data.train
logit.valid1 <- p3$data.val
logit.test1 <- p3$data.test
all.train1 <- rbind(logit.train1, logit.valid1)
fitting logistic model using combo of train/valid/test, finding optimal model using training data.
# Fit logistic model on training data
v.logit.model0 <- glm(popular ~ ., family=binomial(link='logit'),data= logit.train0)
#search for best cutoff (evaluated on the validation set)
out0 <- opt.cut.func(v.logit.model0, logit.valid0)
opt.cut.plot(out0)
# cutoff where sensitivity and specificity are closest
out0$cutoff[which.min(out0$ssdiff.vec)]
## [1] 0.13
# cutoff maximizing kappa — used as the "optimal" cutoff below
v.opt.cut0 <- out0$cutoff[which.max(out0$kappa.vec)]
v.opt.cut0
## [1] 0.16
Fit final model (combo of train and validation)
# Refit on train + validation combined.
# NOTE(review): v.model.final is fit here but never used — the predictions
# below (and in the following chunks) use v.logit.model0, the training-only
# model. Confirm which model was intended.
v.model.final <- glm(popular ~ ., data=all.train0, family=binomial(link='logit'))
predict on test using 0.5 cutoff
v.prob.test <- predict(v.logit.model0, newdata = logit.test0, type = "response")
v.pred.test <- ifelse(v.prob.test > 0.5, 1, 0) # using cutoff = 0.5 (default)
confusionMatrix(as.factor(v.pred.test), as.factor(logit.test0$popular), positive = "1")
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 600 99
## 1 1 1
##
## Accuracy : 0.8573
## 95% CI : (0.8292, 0.8824)
## No Information Rate : 0.8573
## P-Value [Acc > NIR] : 0.5266
##
## Kappa : 0.0141
##
## Mcnemar's Test P-Value : <2e-16
##
## Sensitivity : 0.010000
## Specificity : 0.998336
## Pos Pred Value : 0.500000
## Neg Pred Value : 0.858369
## Prevalence : 0.142653
## Detection Rate : 0.001427
## Detection Prevalence : 0.002853
## Balanced Accuracy : 0.504168
##
## 'Positive' Class : 1
##
predict on optimal cutoff (0.16)
# Same test-set evaluation at the kappa-optimal cutoff.
v.prob.test <- predict(v.logit.model0, newdata = logit.test0, type = "response")
v.pred.test <- ifelse(v.prob.test > v.opt.cut0, 1, 0) # using cutoff = 0.16
confusionMatrix(as.factor(v.pred.test), as.factor(logit.test0$popular), positive = "1")
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 452 54
## 1 149 46
##
## Accuracy : 0.7104
## 95% CI : (0.6753, 0.7438)
## No Information Rate : 0.8573
## P-Value [Acc > NIR] : 1
##
## Kappa : 0.1519
##
## Mcnemar's Test P-Value : 4.181e-11
##
## Sensitivity : 0.46000
## Specificity : 0.75208
## Pos Pred Value : 0.23590
## Neg Pred Value : 0.89328
## Prevalence : 0.14265
## Detection Rate : 0.06562
## Detection Prevalence : 0.27817
## Balanced Accuracy : 0.60604
##
## 'Positive' Class : 1
##
# And at the sensitivity≈specificity cutoff (0.13).
v.prob.test <- predict(v.logit.model0, newdata = logit.test0, type = "response")
v.pred.test <- ifelse(v.prob.test > 0.13, 1, 0) # using cutoff = 0.13
confusionMatrix(as.factor(v.pred.test), as.factor(logit.test0$popular), positive = "1")
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 359 39
## 1 242 61
##
## Accuracy : 0.5991
## 95% CI : (0.5618, 0.6357)
## No Information Rate : 0.8573
## P-Value [Acc > NIR] : 1
##
## Kappa : 0.1123
##
## Mcnemar's Test P-Value : <2e-16
##
## Sensitivity : 0.61000
## Specificity : 0.59734
## Pos Pred Value : 0.20132
## Neg Pred Value : 0.90201
## Prevalence : 0.14265
## Detection Rate : 0.08702
## Detection Prevalence : 0.43224
## Balanced Accuracy : 0.60367
##
## 'Positive' Class : 1
##
step-wise 10 fold cross validation
set.seed(123)
train_control <- trainControl(method = "cv",
number = 10)
# Fit K-fold CV model (stepwise AIC logistic regression; capture.output()
# suppresses the verbose step log)
meh <- capture.output(step0_kcv <- train(popular ~ ., data = logit.train0, family = binomial(),
method = "glmStepAIC", trControl = train_control, verbose = FALSE))
# keep only the underlying glm; the caret wrapper object is discarded
step0_kcv <- step0_kcv$finalModel
step0_kcv
##
## Call: NULL
##
## Coefficients:
## (Intercept) duration acousticness energy
## 5.1001 -0.4797 -1.6165 -3.9184
## instrumentalness key3 key6 loudness
## -12.3907 0.6991 0.3999 0.3236
## speechiness valence
## 3.2522 -1.5547
##
## Degrees of Freedom: 3270 Total (i.e. Null); 3261 Residual
## Null Deviance: 2534
## Residual Deviance: 2377 AIC: 2397
# key.dummy() matches the dummied key columns the stepwise model was fit on.
kcv.prob.test0 <- predict(step0_kcv, newdata = key.dummy(logit.test0), type = "response")
kcv.pred.test0 <- ifelse(kcv.prob.test0 > 0.5, 1, 0) # using cutoff = 0.5
confusionMatrix(as.factor(kcv.pred.test0), as.factor(logit.test0$popular), positive = "1")
Find optimal cut off value:
#search for best cutoff
kcv.out0 <- opt.cut.func(step0_kcv, key.dummy(logit.valid0))
opt.cut.plot(kcv.out0)
kcv.out0$cutoff[which.min(kcv.out0$ssdiff.vec)]
## [1] 0.13
kcv.out0.cut0 <- kcv.out0$cutoff[which.max(kcv.out0$kappa.vec)]
kcv.out0.cut0
## [1] 0.17
# Fit final model (combination of training and validation sets)
# Refit the stepwise-CV model on train + validation combined;
# capture.output() suppresses the verbose stepAIC trace.
finalmeh <- capture.output(step0_kcv.final0 <- train(popular ~ ., data = all.train0, family = binomial(),
method = "glmStepAIC", trControl = train_control, verbose = FALSE))
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
# Keep only the underlying glm fit from the caret wrapper.
step0_kcv.final0 <- step0_kcv.final0$finalModel
step0_kcv.final0
##
## Call: NULL
##
## Coefficients:
## (Intercept) duration acousticness energy
## 5.1320 -0.5246 -1.6542 -3.7039
## instrumentalness key3 key4 key6
## -13.3965 0.5700 -0.2640 0.2858
## loudness speechiness valence
## 0.3295 3.1544 -1.4994
##
## Degrees of Freedom: 3971 Total (i.e. Null); 3961 Residual
## Null Deviance: 3079
## Residual Deviance: 2888 AIC: 2910
# Predict on test using the 0.5 cutoff
# Evaluate the final stepwise-CV model on the test set at the default
# probability threshold of 0.5.
kcv.prob.test0 <- predict(step0_kcv.final0, newdata = key.dummy(logit.test0), type = "response")
kcv.pred.test0 <- as.integer(kcv.prob.test0 > 0.5) # default cutoff 0.5
confusionMatrix(as.factor(kcv.pred.test0), as.factor(logit.test0$popular), positive = "1")
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 600 99
## 1 1 1
##
## Accuracy : 0.8573
## 95% CI : (0.8292, 0.8824)
## No Information Rate : 0.8573
## P-Value [Acc > NIR] : 0.5266
##
## Kappa : 0.0141
##
## Mcnemar's Test P-Value : <2e-16
##
## Sensitivity : 0.010000
## Specificity : 0.998336
## Pos Pred Value : 0.500000
## Neg Pred Value : 0.858369
## Prevalence : 0.142653
## Detection Rate : 0.001427
## Detection Prevalence : 0.002853
## Balanced Accuracy : 0.504168
##
## 'Positive' Class : 1
##
# Predict on test using the 0.16 cutoff
# Re-evaluate the final stepwise-CV model on the test set at the
# validation-derived cutoff of 0.16.
kcv.prob.test0 <- predict(step0_kcv.final0, newdata = key.dummy(logit.test0), type = "response")
kcv.pred.test0 <- as.integer(kcv.prob.test0 > 0.16) # cutoff = 0.16
confusionMatrix(as.factor(kcv.pred.test0), as.factor(logit.test0$popular), positive = "1")
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 451 53
## 1 150 47
##
## Accuracy : 0.7104
## 95% CI : (0.6753, 0.7438)
## No Information Rate : 0.8573
## P-Value [Acc > NIR] : 1
##
## Kappa : 0.157
##
## Mcnemar's Test P-Value : 1.607e-11
##
## Sensitivity : 0.47000
## Specificity : 0.75042
## Pos Pred Value : 0.23858
## Neg Pred Value : 0.89484
## Prevalence : 0.14265
## Detection Rate : 0.06705
## Detection Prevalence : 0.28103
## Balanced Accuracy : 0.61021
##
## 'Positive' Class : 1
##
# Predict on test using the 0.13 cutoff
# Test-set evaluation at the sensitivity/specificity balance cutoff (0.13).
kcv.prob.test0 <- predict(step0_kcv.final0, newdata = key.dummy(logit.test0), type = "response")
kcv.pred.test0 <- ifelse(kcv.prob.test0 > 0.13, 1, 0) # using cutoff = 0.13
confusionMatrix(as.factor(kcv.pred.test0), as.factor(logit.test0$popular), positive = "1")
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 362 26
## 1 239 74
##
## Accuracy : 0.622
## 95% CI : (0.5849, 0.658)
## No Information Rate : 0.8573
## P-Value [Acc > NIR] : 1
##
## Kappa : 0.1813
##
## Mcnemar's Test P-Value : <2e-16
##
## Sensitivity : 0.7400
## Specificity : 0.6023
## Pos Pred Value : 0.2364
## Neg Pred Value : 0.9330
## Prevalence : 0.1427
## Detection Rate : 0.1056
## Detection Prevalence : 0.4465
## Balanced Accuracy : 0.6712
##
## 'Positive' Class : 1
##
# Exhaustive all-subsets logistic regression over main effects, ranked by AIC.
glmulti.out0 <- glmulti(popular ~ ., data = logit.train0,
level = 1, # No interaction considered
method = "h", # Exhaustive approach
crit = "aic", # AIC as criteria
confsetsize = 5, # Keep 5 best models
plotty = F, report = F, # No plot or interim reports
fitfunction = "glm", # fit each candidate model with glm
family = binomial) # binomial to run logistic regression
#glmulti.out0@formulas # ranked candidate formulas, if needed
view summary of top model
summary(glmulti.out0@objects[[1]])
##
## Call:
## fitfunc(formula = as.formula(x), family = ..1, data = data)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.4042 -0.5732 -0.4633 -0.3486 2.4660
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 5.24034 0.74622 7.023 2.18e-12 ***
## duration -0.47455 0.10297 -4.609 4.05e-06 ***
## acousticness -1.62079 0.37812 -4.286 1.82e-05 ***
## energy -4.03171 0.61485 -6.557 5.48e-11 ***
## instrumentalness -12.73308 10.34054 -1.231 0.218
## loudness 0.32748 0.04685 6.991 2.74e-12 ***
## speechiness 3.16638 0.76128 4.159 3.19e-05 ***
## valence -1.50323 0.27942 -5.380 7.46e-08 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 2534.5 on 3270 degrees of freedom
## Residual deviance: 2389.0 on 3263 degrees of freedom
## AIC: 2405
##
## Number of Fisher Scoring iterations: 8
# Store model
# Keep the best-AIC model from the glmulti search.
allreg.logit0 <- glmulti.out0@objects[[1]]
#search for best cutoff on the validation set
allreg.out0 <- opt.cut.func(allreg.logit0, logit.valid0)
opt.cut.plot(allreg.out0)
# cutoff minimizing |sensitivity - specificity|
allreg.out0$cutoff[which.min(allreg.out0$ssdiff.vec)]
## [1] 0.13
# cutoff maximizing kappa, kept for later test-set evaluation
allreg.out0.cut0 <- allreg.out0$cutoff[which.max(allreg.out0$kappa.vec)]
allreg.out0.cut0
## [1] 0.17
## [1] 0.17
# Fit final model on the combination of training and validation sets
# Repeat the exhaustive AIC search on train + validation combined.
glmulti.out0 <- glmulti(popular ~ ., data = all.train0,
level = 1, # No interaction considered
method = "h", # Exhaustive approach
crit = "aic", # AIC as criteria
confsetsize = 5, # Keep 5 best models
plotty = F, report = F, # No plot or interim reports
fitfunction = "glm", # fit each candidate model with glm
family = binomial) # binomial to run logistic regression
#glmulti.out0@formulas # ranked candidate formulas, if needed
summary(glmulti.out0@objects[[1]])
##
## Call:
## fitfunc(formula = as.formula(x), family = ..1, data = data)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.3796 -0.5760 -0.4646 -0.3454 2.4844
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 5.20379 0.68148 7.636 2.24e-14 ***
## duration -0.51635 0.09493 -5.439 5.36e-08 ***
## acousticness -1.66774 0.34909 -4.777 1.78e-06 ***
## energy -3.81732 0.55999 -6.817 9.31e-12 ***
## instrumentalness -13.79437 9.65872 -1.428 0.153
## loudness 0.33261 0.04270 7.789 6.73e-15 ***
## speechiness 3.06337 0.68698 4.459 8.23e-06 ***
## valence -1.42598 0.25451 -5.603 2.11e-08 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 3079.5 on 3971 degrees of freedom
## Residual deviance: 2900.1 on 3964 degrees of freedom
## AIC: 2916.1
##
## Number of Fisher Scoring iterations: 9
# Store final model
allreg.logit0.final <- glmulti.out0@objects[[1]]
# Predictions with 0.5 as the cutoff
# Test-set evaluation of the final all-subsets (glmulti) logistic model
# at the conventional 0.5 probability threshold.
allreg.prob.test0 <- predict(allreg.logit0.final, newdata = logit.test0, type = "response")
allreg.pred.test0 <- as.integer(allreg.prob.test0 > 0.5) # default cutoff 0.5
confusionMatrix(as.factor(allreg.pred.test0), as.factor(logit.test0$popular), positive = "1")
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 600 99
## 1 1 1
##
## Accuracy : 0.8573
## 95% CI : (0.8292, 0.8824)
## No Information Rate : 0.8573
## P-Value [Acc > NIR] : 0.5266
##
## Kappa : 0.0141
##
## Mcnemar's Test P-Value : <2e-16
##
## Sensitivity : 0.010000
## Specificity : 0.998336
## Pos Pred Value : 0.500000
## Neg Pred Value : 0.858369
## Prevalence : 0.142653
## Detection Rate : 0.001427
## Detection Prevalence : 0.002853
## Balanced Accuracy : 0.504168
##
## 'Positive' Class : 1
##
# Predictions using the cutoff with the best kappa
# Test-set evaluation at the kappa-optimal validation cutoff (0.17).
allreg.prob.test0 <- predict(allreg.logit0.final, newdata = logit.test0, type = "response")
allreg.pred.test0 <- ifelse(allreg.prob.test0 > allreg.out0.cut0, 1, 0) # kappa-optimal cutoff (0.17)
confusionMatrix(as.factor(allreg.pred.test0), as.factor(logit.test0$popular), positive = "1")
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 472 59
## 1 129 41
##
## Accuracy : 0.7318
## 95% CI : (0.6974, 0.7643)
## No Information Rate : 0.8573
## P-Value [Acc > NIR] : 1
##
## Kappa : 0.1512
##
## Mcnemar's Test P-Value : 4.845e-07
##
## Sensitivity : 0.41000
## Specificity : 0.78536
## Pos Pred Value : 0.24118
## Neg Pred Value : 0.88889
## Prevalence : 0.14265
## Detection Rate : 0.05849
## Detection Prevalence : 0.24251
## Balanced Accuracy : 0.59768
##
## 'Positive' Class : 1
##
# Predictions using the cutoff with the best balance of sensitivity and specificity
# Test-set evaluation at the sensitivity/specificity balance cutoff (0.14).
allreg.prob.test0 <- predict(allreg.logit0.final, newdata = logit.test0, type = "response")
allreg.pred.test0 <- ifelse(allreg.prob.test0 > 0.14, 1, 0) # using cutoff = 0.14
confusionMatrix(as.factor(allreg.pred.test0), as.factor(logit.test0$popular), positive = "1")
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 383 33
## 1 218 67
##
## Accuracy : 0.6419
## 95% CI : (0.6052, 0.6775)
## No Information Rate : 0.8573
## P-Value [Acc > NIR] : 1
##
## Kappa : 0.1735
##
## Mcnemar's Test P-Value : <2e-16
##
## Sensitivity : 0.67000
## Specificity : 0.63727
## Pos Pred Value : 0.23509
## Neg Pred Value : 0.92067
## Prevalence : 0.14265
## Detection Rate : 0.09558
## Detection Prevalence : 0.40656
## Balanced Accuracy : 0.65364
##
## 'Positive' Class : 1
##
# Ridge logistic regression (alpha = 0) tuned over a lambda grid with 10-fold CV.
ridge0 <- train(popular ~ ., data = logit.train0, method = "glmnet",
family = "binomial", trControl = train_control,
tuneGrid = expand.grid(alpha = 0,lambda = seq(0.001,0.1,by = 0.001)))
#best tuning parameter
# NOTE(review): the selected lambda (0.1) sits at the upper bound of the
# grid -- consider widening the lambda range.
ridge0$bestTune
## alpha lambda
## 100 0 0.1
# Coefficients at the CV-selected lambda.
ridge0.model <- coef(ridge0$finalModel, ridge0$bestTune$lambda)
ridge0.model
## 21 x 1 sparse Matrix of class "dgCMatrix"
## 1
## (Intercept) 0.1915545704
## duration -0.2316446189
## acousticness -0.4391019903
## danceability -0.0498944180
## energy -0.6681396288
## instrumentalness -1.0177900019
## key1 0.0754947879
## key2 0.0713988992
## key3 0.3855850943
## key4 -0.1246140988
## key5 -0.0585100854
## key6 0.2136400771
## key7 0.0077267527
## key8 0.1054770688
## key9 -0.0325615911
## key10 -0.0408322593
## key11 -0.0604665387
## loudness 0.0766007935
## speechiness 1.5946326720
## tempo 0.0001130078
## valence -0.7596590057
# Search for the best cutoff using the validation set
# Cutoff search on the validation set (regularized-model variant of opt.cut.func).
ridge0.out <- reg.opt.cut.func(ridge0, logit.valid0)
opt.cut.plot(ridge0.out)
# cut off by kappa
ridge0.out$cutoff[which.max(ridge0.out$kappa.vec)]
## [1] 0.14
# cutoff minimizing |sensitivity - specificity|
ridge0.out$cutoff[which.min(ridge0.out$ssdiff.vec)]
## [1] 0.13
## [1] 0.13
# Create final model
# Refit the ridge model on train + validation combined.
ridge0 <- train(popular ~ ., data = all.train0, method = "glmnet",
family = "binomial", trControl = train_control,
tuneGrid = expand.grid(alpha = 0,lambda = seq(0.001,0.1,by = 0.001)))
#best tuning parameter
# NOTE(review): lambda again selects the grid upper bound (0.1).
ridge0$bestTune
## alpha lambda
## 100 0 0.1
# Coefficients at the CV-selected lambda.
ridge0.model <- coef(ridge0$finalModel, ridge0$bestTune$lambda)
ridge0.model
## 21 x 1 sparse Matrix of class "dgCMatrix"
## 1
## (Intercept) 8.530228e-02
## duration -2.435653e-01
## acousticness -4.630692e-01
## danceability 7.287529e-02
## energy -5.554047e-01
## instrumentalness -9.877648e-01
## key1 7.474446e-02
## key2 8.256308e-02
## key3 3.236407e-01
## key4 -1.290358e-01
## key5 3.251688e-02
## key6 1.653136e-01
## key7 -1.463599e-02
## key8 1.009955e-01
## key9 -7.740618e-02
## key10 -2.259250e-02
## key11 -7.828105e-02
## loudness 8.041691e-02
## speechiness 1.589226e+00
## tempo -6.196997e-05
## valence -7.257510e-01
# Predict and evaluate on the test set with the cutoff at 0.5
# Ridge model: test-set evaluation at the default 0.5 cutoff.
# Column 2 of the probability matrix is the predicted P(popular = 1).
prob.ridge0 <- predict(ridge0, s = ridge0$bestTune, logit.test0, type = "prob")
pred.ridge0 <- as.integer(prob.ridge0[, 2] > 0.50) # default cutoff 0.5
confusionMatrix(as.factor(pred.ridge0), as.factor(logit.test0$popular),
positive = "1")
## Warning in confusionMatrix.default(as.factor(pred.ridge0),
## as.factor(logit.test0$popular), : Levels are not in the same order for reference
## and data. Refactoring data to match.
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 601 100
## 1 0 0
##
## Accuracy : 0.8573
## 95% CI : (0.8292, 0.8824)
## No Information Rate : 0.8573
## P-Value [Acc > NIR] : 0.5266
##
## Kappa : 0
##
## Mcnemar's Test P-Value : <2e-16
##
## Sensitivity : 0.0000
## Specificity : 1.0000
## Pos Pred Value : NaN
## Neg Pred Value : 0.8573
## Prevalence : 0.1427
## Detection Rate : 0.0000
## Detection Prevalence : 0.0000
## Balanced Accuracy : 0.5000
##
## 'Positive' Class : 1
##
# Predict and evaluate on the test set with the cutoff at 0.12, corresponding to optimal kappa
# Ridge model: test-set evaluation at the kappa-optimal cutoff (0.12)
# chosen on the validation set.
# Fixes two typos: 'prob.ridg0' (the classification below was silently
# reusing the previous chunk's stale 'prob.ridge0') and 'ridge0bestTune'
# (undefined name; should be ridge0$bestTune).
prob.ridge0 <- predict(ridge0, s = ridge0$bestTune, logit.test0, type = "prob")
pred.ridge0 <- ifelse(prob.ridge0[,2] > 0.12, 1, 0) # using cutoff = 0.12
confusionMatrix(as.factor(pred.ridge0), as.factor(logit.test0$popular),
positive = "1")
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 226 20
## 1 375 80
##
## Accuracy : 0.4365
## 95% CI : (0.3994, 0.4741)
## No Information Rate : 0.8573
## P-Value [Acc > NIR] : 1
##
## Kappa : 0.071
##
## Mcnemar's Test P-Value : <2e-16
##
## Sensitivity : 0.8000
## Specificity : 0.3760
## Pos Pred Value : 0.1758
## Neg Pred Value : 0.9187
## Prevalence : 0.1427
## Detection Rate : 0.1141
## Detection Prevalence : 0.6491
## Balanced Accuracy : 0.5880
##
## 'Positive' Class : 1
##
# Predict and evaluate on the test set with the cutoff at 0.14, corresponding to the optimal sensitivity/specificity balance
# Ridge model: test-set evaluation at the sensitivity/specificity balance
# cutoff (0.14) chosen on the validation set.
# Fixes two typos: 'prob.ridg0' (stale 'prob.ridge0' was silently reused)
# and 'ridge0bestTune' (undefined name; should be ridge0$bestTune).
prob.ridge0 <- predict(ridge0, s = ridge0$bestTune, logit.test0, type = "prob")
pred.ridge0 <- ifelse(prob.ridge0[,2] > 0.14, 1, 0) # using cutoff = 0.14
confusionMatrix(as.factor(pred.ridge0), as.factor(logit.test0$popular),
positive = "1")
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 407 48
## 1 194 52
##
## Accuracy : 0.6548
## 95% CI : (0.6183, 0.69)
## No Information Rate : 0.8573
## P-Value [Acc > NIR] : 1
##
## Kappa : 0.1226
##
## Mcnemar's Test P-Value : <2e-16
##
## Sensitivity : 0.52000
## Specificity : 0.67720
## Pos Pred Value : 0.21138
## Neg Pred Value : 0.89451
## Prevalence : 0.14265
## Detection Rate : 0.07418
## Detection Prevalence : 0.35093
## Balanced Accuracy : 0.59860
##
## 'Positive' Class : 1
##
# Elastic net: tune alpha (ridge <-> lasso mix) and lambda jointly with 10-fold CV.
enet0 <- train(popular ~ ., data = logit.train0, method = "glmnet",
family = "binomial", trControl = train_control,
tuneGrid = expand.grid(alpha = seq(0,1,by = 0.05),lambda = seq(0.001,0.1,by = 0.001)))
# best parameter
# NOTE(review): CV picks alpha = 0 (pure ridge) with lambda at the grid
# upper bound (0.1) -- identical to the ridge fit above.
enet0$bestTune
## alpha lambda
## 100 0 0.1
# best coefficient
enet0.model <- coef(enet0$finalModel, enet0$bestTune$lambda)
enet0.model
## 21 x 1 sparse Matrix of class "dgCMatrix"
## 1
## (Intercept) 0.1915545704
## duration -0.2316446189
## acousticness -0.4391019903
## danceability -0.0498944180
## energy -0.6681396288
## instrumentalness -1.0177900019
## key1 0.0754947879
## key2 0.0713988992
## key3 0.3855850943
## key4 -0.1246140988
## key5 -0.0585100854
## key6 0.2136400771
## key7 0.0077267527
## key8 0.1054770688
## key9 -0.0325615911
## key10 -0.0408322593
## key11 -0.0604665387
## loudness 0.0766007935
## speechiness 1.5946326720
## tempo 0.0001130078
## valence -0.7596590057
# Search for the best cutoff with the validation set
# Cutoff search on the validation set for the elastic net model.
enet0.out <- reg.opt.cut.func(enet0, logit.valid0)
opt.cut.plot(enet0.out)
# cut off by kappa
enet0.out$cutoff[which.max(enet0.out$kappa.vec)]
## [1] 0.14
# cutoff minimizing |sensitivity - specificity|
enet0.out$cutoff[which.min(enet0.out$ssdiff.vec)]
## [1] 0.13
## [1] 0.13
# Create final model on the combined training and validation sets
# Refit the elastic net on train + validation combined.
enet0 <- train(popular ~ ., data = all.train0, method = "glmnet",
family = "binomial", trControl = train_control,
tuneGrid = expand.grid(alpha = seq(0,1,by = 0.05),lambda = seq(0.001,0.1,by = 0.001)))
# best parameter
# NOTE(review): again selects alpha = 0 with lambda at the grid upper bound.
enet0$bestTune
## alpha lambda
## 100 0 0.1
# best coefficient
enet0.model <- coef(enet0$finalModel, enet0$bestTune$lambda)
enet0.model
## 21 x 1 sparse Matrix of class "dgCMatrix"
## 1
## (Intercept) 8.530228e-02
## duration -2.435653e-01
## acousticness -4.630692e-01
## danceability 7.287529e-02
## energy -5.554047e-01
## instrumentalness -9.877648e-01
## key1 7.474446e-02
## key2 8.256308e-02
## key3 3.236407e-01
## key4 -1.290358e-01
## key5 3.251688e-02
## key6 1.653136e-01
## key7 -1.463599e-02
## key8 1.009955e-01
## key9 -7.740618e-02
## key10 -2.259250e-02
## key11 -7.828105e-02
## loudness 8.041691e-02
## speechiness 1.589226e+00
## tempo -6.196997e-05
## valence -7.257510e-01
# Predict and evaluate on the test set with the cutoff at 0.5
# Elastic net: test-set evaluation at the default 0.5 cutoff.
# Column 2 of the probability matrix is the predicted P(popular = 1).
prob.enet0 <- predict(enet0, s = enet0$bestTune, logit.test0, type = "prob")
pred.enet0 <- as.integer(prob.enet0[, 2] > 0.50) # default cutoff 0.5
confusionMatrix(as.factor(pred.enet0), as.factor(logit.test0$popular),
positive = "1")
## Warning in confusionMatrix.default(as.factor(pred.enet0),
## as.factor(logit.test0$popular), : Levels are not in the same order for reference
## and data. Refactoring data to match.
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 601 100
## 1 0 0
##
## Accuracy : 0.8573
## 95% CI : (0.8292, 0.8824)
## No Information Rate : 0.8573
## P-Value [Acc > NIR] : 0.5266
##
## Kappa : 0
##
## Mcnemar's Test P-Value : <2e-16
##
## Sensitivity : 0.0000
## Specificity : 1.0000
## Pos Pred Value : NaN
## Neg Pred Value : 0.8573
## Prevalence : 0.1427
## Detection Rate : 0.0000
## Detection Prevalence : 0.0000
## Balanced Accuracy : 0.5000
##
## 'Positive' Class : 1
##
# Predict and evaluate on the test set with the cutoff at 0.12, corresponding to optimal kappa
# Elastic net: test-set evaluation at the kappa-optimal cutoff (0.12).
prob.enet0 <- predict(enet0, s = enet0$bestTune, logit.test0, type = "prob")
pred.enet0 <- ifelse(prob.enet0[,2] > 0.12, 1, 0) # using cutoff = 0.12
confusionMatrix(as.factor(pred.enet0), as.factor(logit.test0$popular),
positive = "1")
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 226 20
## 1 375 80
##
## Accuracy : 0.4365
## 95% CI : (0.3994, 0.4741)
## No Information Rate : 0.8573
## P-Value [Acc > NIR] : 1
##
## Kappa : 0.071
##
## Mcnemar's Test P-Value : <2e-16
##
## Sensitivity : 0.8000
## Specificity : 0.3760
## Pos Pred Value : 0.1758
## Neg Pred Value : 0.9187
## Prevalence : 0.1427
## Detection Rate : 0.1141
## Detection Prevalence : 0.6491
## Balanced Accuracy : 0.5880
##
## 'Positive' Class : 1
##
# Predict and evaluate on the test set with the cutoff at 0.14, corresponding to the optimal sensitivity/specificity balance
# Elastic net: test-set evaluation at the sensitivity/specificity
# balance cutoff (0.14).
prob.enet0 <- predict(enet0, s = enet0$bestTune, logit.test0, type = "prob")
pred.enet0 <- ifelse(prob.enet0[,2] > 0.14, 1, 0) # using cutoff = 0.14
confusionMatrix(as.factor(pred.enet0), as.factor(logit.test0$popular),
positive = "1")
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 407 48
## 1 194 52
##
## Accuracy : 0.6548
## 95% CI : (0.6183, 0.69)
## No Information Rate : 0.8573
## P-Value [Acc > NIR] : 1
##
## Kappa : 0.1226
##
## Mcnemar's Test P-Value : <2e-16
##
## Sensitivity : 0.52000
## Specificity : 0.67720
## Pos Pred Value : 0.21138
## Neg Pred Value : 0.89451
## Prevalence : 0.14265
## Detection Rate : 0.07418
## Detection Prevalence : 0.35093
## Balanced Accuracy : 0.59860
##
## 'Positive' Class : 1
##
# Fit a logistic model using the train/valid/test split; find the optimal model using the training data
# Fit logistic model on training data (second data split, "1" suffix)
v.logit.model1 <- glm(popular ~ ., family=binomial(link='logit'),data= logit.train1)
#search for best cutoff on the validation set
out1 <- opt.cut.func(v.logit.model1, logit.valid1)
opt.cut.plot(out1)
# cutoff minimizing |sensitivity - specificity|
out1$cutoff[which.min(out1$ssdiff.vec)]
## [1] 0.11
# cutoff maximizing kappa, kept for later test-set evaluation
v.opt.cut1 <- out1$cutoff[which.max(out1$kappa.vec)]
v.opt.cut1
## [1] 0.16
# Fit final model (combination of training and validation sets)
v.model.final1 <- glm(popular ~ ., data=all.train1, family=binomial(link='logit'))
# Predict on test using the 0.5 cutoff
# Test-set evaluation at the default 0.5 cutoff.
v.prob.test1 <- predict(v.logit.model1, newdata = logit.test1, type = "response")
v.pred.test1 <- ifelse(v.prob.test1 > 0.5, 1, 0) # using cutoff = 0.5
confusionMatrix(as.factor(v.pred.test1), as.factor(logit.test1$popular), positive = "1")
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 990 109
## 1 1 1
##
## Accuracy : 0.9001
## 95% CI : (0.8808, 0.9172)
## No Information Rate : 0.9001
## P-Value [Acc > NIR] : 0.5254
##
## Kappa : 0.0143
##
## Mcnemar's Test P-Value : <2e-16
##
## Sensitivity : 0.0090909
## Specificity : 0.9989909
## Pos Pred Value : 0.5000000
## Neg Pred Value : 0.9008189
## Prevalence : 0.0999092
## Detection Rate : 0.0009083
## Detection Prevalence : 0.0018165
## Balanced Accuracy : 0.5040409
##
## 'Positive' Class : 1
##
# Predict on test using the optimal cutoff (kappa)
# Test-set evaluation at the kappa-optimal validation cutoff (v.opt.cut1 = 0.16).
v.prob.test1 <- predict(v.logit.model1, newdata = logit.test1, type = "response")
v.pred.test1 <- ifelse(v.prob.test1 > v.opt.cut1, 1, 0) # using cutoff = 0.16
confusionMatrix(as.factor(v.pred.test1), as.factor(logit.test1$popular), positive = "1")
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 800 71
## 1 191 39
##
## Accuracy : 0.762
## 95% CI : (0.7357, 0.7869)
## No Information Rate : 0.9001
## P-Value [Acc > NIR] : 1
##
## Kappa : 0.109
##
## Mcnemar's Test P-Value : 1.955e-13
##
## Sensitivity : 0.35455
## Specificity : 0.80727
## Pos Pred Value : 0.16957
## Neg Pred Value : 0.91848
## Prevalence : 0.09991
## Detection Rate : 0.03542
## Detection Prevalence : 0.20890
## Balanced Accuracy : 0.58091
##
## 'Positive' Class : 1
##
# Predict on test using the optimal cutoff (sensitivity/specificity balance)
# Test-set evaluation at the sensitivity/specificity balance cutoff (0.11)
# found on the validation set.
v.prob.test1 <- predict(v.logit.model1, newdata = logit.test1, type = "response")
v.pred.test1 <- as.integer(v.prob.test1 > 0.11) # cutoff = 0.11
confusionMatrix(as.factor(v.pred.test1), as.factor(logit.test1$popular), positive = "1")
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 560 39
## 1 431 71
##
## Accuracy : 0.5731
## 95% CI : (0.5433, 0.6026)
## No Information Rate : 0.9001
## P-Value [Acc > NIR] : 1
##
## Kappa : 0.0815
##
## Mcnemar's Test P-Value : <2e-16
##
## Sensitivity : 0.64545
## Specificity : 0.56509
## Pos Pred Value : 0.14143
## Neg Pred Value : 0.93489
## Prevalence : 0.09991
## Detection Rate : 0.06449
## Detection Prevalence : 0.45595
## Balanced Accuracy : 0.60527
##
## 'Positive' Class : 1
##
# Step-wise 10-fold cross-validation
# Reproducible 10-fold CV setup for caret::train (second data split).
set.seed(123)
train_control <- trainControl(method = "cv",
number = 10)
# Fit K-fold CV model; capture.output() suppresses the verbose stepAIC trace.
meh <- capture.output(step1_kcv <- train(popular ~ ., data = logit.train1, family = binomial(),
method = "glmStepAIC", trControl = train_control, verbose = FALSE))
# Keep only the underlying glm fit from the caret wrapper.
step1_kcv <- step1_kcv$finalModel
step1_kcv
##
## Call: NULL
##
## Coefficients:
## (Intercept) duration acousticness danceability
## 4.1871 -0.4871 -0.7884 0.7627
## energy instrumentalness key4 key8
## -3.7851 -13.4344 -0.4462 0.2437
## key10 loudness speechiness valence
## -0.5490 0.3480 5.4057 -1.4309
##
## Degrees of Freedom: 5136 Total (i.e. Null); 5125 Residual
## Null Deviance: 3675
## Residual Deviance: 3432 AIC: 3456
# Search the validation set for the best probability cutoff (0 to 1 by 0.01).
# key.dummy() is applied because the stepwise model was fit on dummy-coded keys.
kcv.out1 <- opt.cut.func(step1_kcv, key.dummy(logit.valid1))
opt.cut.plot(kcv.out1)
# Cutoff minimizing |sensitivity - specificity|
kcv.out1$cutoff[which.min(kcv.out1$ssdiff.vec)]
## [1] 0.11
# Cutoff maximizing kappa (kept for later test-set evaluation)
kcv.out1.cut1 <- kcv.out1$cutoff[which.max(kcv.out1$kappa.vec)]
kcv.out1.cut1
## [1] 0.18
Fit the final stepwise model on the combined training and validation data
# Refit the stepwise-CV model on train + validation combined for final testing.
# `finalmeh` only captures glmStepAIC's console output (see earlier fit).
# NOTE(review): no set.seed() before this fit, so CV folds are not reproducible
# across runs — confirm this is intended.
finalmeh <- capture.output(step1_kcv.final <- train(popular ~ ., data = all.train1, family = binomial(),
method = "glmStepAIC", trControl = train_control, verbose = FALSE))
step1_kcv.final <- step1_kcv.final$finalModel
step1_kcv.final
##
## Call: NULL
##
## Coefficients:
## (Intercept) duration acousticness danceability
## 4.0904 -0.4978 -0.9406 0.6354
## energy instrumentalness key3 key4
## -3.6817 -13.6231 0.4237 -0.3088
## key8 key10 loudness speechiness
## 0.2698 -0.6307 0.3306 5.1625
## valence
## -1.3215
##
## Degrees of Freedom: 6237 Total (i.e. Null); 6225 Residual
## Null Deviance: 4372
## Residual Deviance: 4096 AIC: 4122
predict on test using 0.5 cutoff
# Evaluate the final stepwise model on the test set at the default 0.5 cutoff.
kcv.prob.test1 <- predict(step1_kcv.final, newdata = key.dummy(logit.test1), type = "response")
kcv.pred.test1 <- as.numeric(kcv.prob.test1 > 0.5)  # default cutoff = 0.5
confusionMatrix(as.factor(kcv.pred.test1),
as.factor(logit.test1$popular),
positive = "1")
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 990 110
## 1 1 0
##
## Accuracy : 0.8992
## 95% CI : (0.8799, 0.9163)
## No Information Rate : 0.9001
## P-Value [Acc > NIR] : 0.5651
##
## Kappa : -0.0018
##
## Mcnemar's Test P-Value : <2e-16
##
## Sensitivity : 0.0000000
## Specificity : 0.9989909
## Pos Pred Value : 0.0000000
## Neg Pred Value : 0.9000000
## Prevalence : 0.0999092
## Detection Rate : 0.0000000
## Detection Prevalence : 0.0009083
## Balanced Accuracy : 0.4994955
##
## 'Positive' Class : 1
##
# Re-evaluate on the test set using the kappa-optimal cutoff from validation.
kcv.prob.test1 <- predict(step1_kcv.final, newdata = key.dummy(logit.test1), type = "response")
kcv.pred.test1 <- ifelse(kcv.prob.test1 > kcv.out1.cut1, 1, 0) # kappa-optimal cutoff (0.18); the previous comment's "0.5" was a copy-paste error
confusionMatrix(as.factor(kcv.pred.test1), as.factor(logit.test1$popular), positive = "1")
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 871 82
## 1 120 28
##
## Accuracy : 0.8165
## 95% CI : (0.7924, 0.839)
## No Information Rate : 0.9001
## P-Value [Acc > NIR] : 1.000000
##
## Kappa : 0.1157
##
## Mcnemar's Test P-Value : 0.009233
##
## Sensitivity : 0.25455
## Specificity : 0.87891
## Pos Pred Value : 0.18919
## Neg Pred Value : 0.91396
## Prevalence : 0.09991
## Detection Rate : 0.02543
## Detection Prevalence : 0.13442
## Balanced Accuracy : 0.56673
##
## 'Positive' Class : 1
##
# Re-evaluate on the test set using the sensitivity/specificity-balancing cutoff.
kcv.prob.test1 <- predict(step1_kcv.final, newdata = key.dummy(logit.test1), type = "response")
kcv.pred.test1 <- ifelse(kcv.prob.test1 > 0.11, 1, 0) # cutoff = 0.11 (validation ssdiff optimum); the previous comment's "0.5" was a copy-paste error
confusionMatrix(as.factor(kcv.pred.test1), as.factor(logit.test1$popular), positive = "1")
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 577 40
## 1 414 70
##
## Accuracy : 0.5876
## 95% CI : (0.5579, 0.6169)
## No Information Rate : 0.9001
## P-Value [Acc > NIR] : 1
##
## Kappa : 0.087
##
## Mcnemar's Test P-Value : <2e-16
##
## Sensitivity : 0.63636
## Specificity : 0.58224
## Pos Pred Value : 0.14463
## Neg Pred Value : 0.93517
## Prevalence : 0.09991
## Detection Rate : 0.06358
## Detection Prevalence : 0.43960
## Balanced Accuracy : 0.60930
##
## 'Positive' Class : 1
##
# Exhaustive best-subset logistic regression over all main effects via glmulti.
glmulti.out1 <- glmulti(popular ~ ., data = logit.train1,
level = 1, # No interaction considered
method = "h", # Exhaustive approach
crit = "aic", # AIC as criteria
confsetsize = 5, # Keep 5 best models
plotty = F, report = F, # No plot or interim reports
fitfunction = "glm", # glm fit function (logistic via the family below)
family = binomial) # binomial to run logistic regression
#glmulti.out1@formulas # uncomment to inspect the 5 retained formulas
view summary of top model
summary(glmulti.out1@objects[[1]])
##
## Call:
## fitfunc(formula = as.formula(x), family = ..1, data = data)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.2927 -0.5336 -0.4321 -0.3262 2.7541
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 4.10815 0.68233 6.021 1.74e-09 ***
## duration -0.48381 0.08923 -5.422 5.89e-08 ***
## acousticness -0.82521 0.27036 -3.052 0.00227 **
## danceability 0.82523 0.42883 1.924 0.05431 .
## energy -3.78352 0.51679 -7.321 2.46e-13 ***
## instrumentalness -13.40830 7.93108 -1.691 0.09091 .
## loudness 0.34743 0.03805 9.130 < 2e-16 ***
## speechiness 5.44074 0.61682 8.821 < 2e-16 ***
## valence -1.40523 0.27010 -5.203 1.97e-07 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 3675.4 on 5136 degrees of freedom
## Residual deviance: 3442.9 on 5128 degrees of freedom
## AIC: 3460.9
##
## Number of Fisher Scoring iterations: 9
Store model
# Keep the single best (lowest-AIC) model from the candidate set.
allreg.logit1 <- glmulti.out1@objects[[1]]
#search for best cutoff on the validation set
# NOTE(review): no key.dummy() here — the selected model retained no key terms
# (see its summary above), so the raw validation frame suffices.
allreg.out1 <- opt.cut.func(allreg.logit1, logit.valid1)
opt.cut.plot(allreg.out1)
# Cutoff balancing sensitivity and specificity
allreg.out1$cutoff[which.min(allreg.out1$ssdiff.vec)]
## [1] 0.11
# Cutoff maximizing kappa (used later on the test set)
allreg.out1.cut1 <- allreg.out1$cutoff[which.max(allreg.out1$kappa.vec)]
allreg.out1.cut1
## [1] 0.16
fit final model to combo of training and validation
# Refit the exhaustive best-subset search on combined train + validation data.
glmulti.out1 <- glmulti(popular ~ ., data = all.train1,
level = 1, # No interaction considered
method = "h", # Exhaustive approach
crit = "aic", # AIC as criteria
confsetsize = 5, # Keep 5 best models
plotty = F, report = F, # No plot or interim reports
fitfunction = "glm", # glm fit function (logistic via the family below)
family = binomial) # binomial to run logistic regression
#glmulti.out1@formulas # uncomment to inspect retained formulas
summary(glmulti.out1@objects[[1]])
##
## Call:
## fitfunc(formula = as.formula(x), family = ..1, data = data)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.9085 -0.5249 -0.4297 -0.3287 2.7358
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 4.06861 0.62756 6.483 8.98e-11 ***
## duration -0.49333 0.08221 -6.001 1.96e-09 ***
## acousticness -0.94155 0.24944 -3.775 0.00016 ***
## danceability 0.63868 0.39311 1.625 0.10423
## energy -3.68922 0.47529 -7.762 8.36e-15 ***
## instrumentalness -13.48141 7.35928 -1.832 0.06697 .
## loudness 0.32993 0.03477 9.488 < 2e-16 ***
## speechiness 5.19880 0.55978 9.287 < 2e-16 ***
## valence -1.30087 0.24869 -5.231 1.69e-07 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 4372.3 on 6237 degrees of freedom
## Residual deviance: 4113.3 on 6229 degrees of freedom
## AIC: 4131.3
##
## Number of Fisher Scoring iterations: 9
store final model
allreg.logit1.final <- glmulti.out1@objects[[1]]
Predictions with 0.5 as the cut off
# Test-set evaluation of the best-subset model at the standard 0.5 cutoff.
allreg.prob.test1 <- predict(allreg.logit1.final, newdata = logit.test1, type = "response")
allreg.pred.test1 <- as.numeric(allreg.prob.test1 > 0.5)  # standard cutoff 0.5
confusionMatrix(as.factor(allreg.pred.test1),
as.factor(logit.test1$popular),
positive = "1")
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 990 109
## 1 1 1
##
## Accuracy : 0.9001
## 95% CI : (0.8808, 0.9172)
## No Information Rate : 0.9001
## P-Value [Acc > NIR] : 0.5254
##
## Kappa : 0.0143
##
## Mcnemar's Test P-Value : <2e-16
##
## Sensitivity : 0.0090909
## Specificity : 0.9989909
## Pos Pred Value : 0.5000000
## Neg Pred Value : 0.9008189
## Prevalence : 0.0999092
## Detection Rate : 0.0009083
## Detection Prevalence : 0.0018165
## Balanced Accuracy : 0.5040409
##
## 'Positive' Class : 1
##
Predictions where cut off is the best kappa
# Test-set evaluation at the kappa-optimal cutoff found on validation.
allreg.prob.test1 <- predict(allreg.logit1.final, newdata = logit.test1, type = "response")
allreg.pred.test1 <- ifelse(allreg.prob.test1 > allreg.out1.cut1, 1, 0) # kappa-optimal cutoff = 0.16 (the old comment's "0.17" was stale)
confusionMatrix(as.factor(allreg.pred.test1), as.factor(logit.test1$popular), positive = "1")
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 821 75
## 1 170 35
##
## Accuracy : 0.7775
## 95% CI : (0.7517, 0.8017)
## No Information Rate : 0.9001
## P-Value [Acc > NIR] : 1
##
## Kappa : 0.106
##
## Mcnemar's Test P-Value : 1.908e-09
##
## Sensitivity : 0.31818
## Specificity : 0.82846
## Pos Pred Value : 0.17073
## Neg Pred Value : 0.91629
## Prevalence : 0.09991
## Detection Rate : 0.03179
## Detection Prevalence : 0.18619
## Balanced Accuracy : 0.57332
##
## 'Positive' Class : 1
##
Predictions where cut off is the best balance of sensitivity and specificity
# Test-set evaluation at the sensitivity/specificity-balancing cutoff (0.11).
allreg.prob.test1 <- predict(allreg.logit1.final, newdata = logit.test1, type = "response")
allreg.pred.test1 <- as.numeric(allreg.prob.test1 > 0.11)  # balanced cutoff
confusionMatrix(as.factor(allreg.pred.test1),
as.factor(logit.test1$popular),
positive = "1")
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 585 41
## 1 406 69
##
## Accuracy : 0.594
## 95% CI : (0.5643, 0.6232)
## No Information Rate : 0.9001
## P-Value [Acc > NIR] : 1
##
## Kappa : 0.0879
##
## Mcnemar's Test P-Value : <2e-16
##
## Sensitivity : 0.62727
## Specificity : 0.59031
## Pos Pred Value : 0.14526
## Neg Pred Value : 0.93450
## Prevalence : 0.09991
## Detection Rate : 0.06267
## Detection Prevalence : 0.43143
## Balanced Accuracy : 0.60879
##
## 'Positive' Class : 1
##
# Ridge logistic regression (alpha = 0) tuned over a lambda grid with 10-fold CV.
ridge1 <- train(popular ~ ., data = logit.train1, method = "glmnet",
family = "binomial", trControl = train_control,
tuneGrid = expand.grid(alpha = 0,lambda = seq(0.001,0.1,by = 0.001)))
#best tuning parameter
# NOTE(review): the selected lambda (0.1) sits at the upper edge of the grid —
# consider widening the grid to confirm it is a true optimum.
ridge1$bestTune
## alpha lambda
## 100 0 0.1
# Coefficients at the selected lambda.
ridge1.model <- coef(ridge1$finalModel, ridge1$bestTune$lambda)
ridge1.model
## 21 x 1 sparse Matrix of class "dgCMatrix"
## 1
## (Intercept) -0.7794612024
## duration -0.2103504081
## acousticness -0.1043006389
## danceability 0.1845311234
## energy -0.3424994323
## instrumentalness -0.8360208822
## key1 -0.0052137727
## key2 0.0277371298
## key3 0.1656719284
## key4 -0.1672407313
## key5 0.0652609329
## key6 -0.0837961594
## key7 0.0624070766
## key8 0.1531177410
## key9 0.1399246098
## key10 -0.2261685699
## key11 0.0117086597
## loudness 0.0667931811
## speechiness 2.5045118512
## tempo 0.0005094302
## valence -0.5176594535
Search for best cutoff using validation set
# Validation-set cutoff search for the ridge model (reg.opt.cut.func is the
# variant of opt.cut.func for caret/glmnet "prob"-type predictions).
ridge1.out <- reg.opt.cut.func(ridge1, logit.valid1)
opt.cut.plot(ridge1.out)
# cut off by kappa
ridge1.out$cutoff[which.max(ridge1.out$kappa.vec)]
## [1] 0.15
# cut off balancing sensitivity and specificity
ridge1.out$cutoff[which.min(ridge1.out$ssdiff.vec)]
## [1] 0.12
create final model
# Refit the ridge model on combined train + validation for final testing.
ridge1 <- train(popular ~ ., data = all.train1, method = "glmnet",
family = "binomial", trControl = train_control,
tuneGrid = expand.grid(alpha = 0,lambda = seq(0.001,0.1,by = 0.001)))
#best tuning parameter (again at the grid edge, lambda = 0.1)
ridge1$bestTune
## alpha lambda
## 100 0 0.1
# Coefficients at the selected lambda.
ridge1.model <- coef(ridge1$finalModel, ridge1$bestTune$lambda)
ridge1.model
## 21 x 1 sparse Matrix of class "dgCMatrix"
## 1
## (Intercept) -0.895075690
## duration -0.210354815
## acousticness -0.142604779
## danceability 0.144718060
## energy -0.323758232
## instrumentalness -0.838304065
## key1 0.059858865
## key2 0.012365812
## key3 0.208277667
## key4 -0.125410825
## key5 0.041362313
## key6 -0.023137874
## key7 0.065428774
## key8 0.153045769
## key9 0.084576094
## key10 -0.256890886
## key11 -0.017272069
## loudness 0.062665247
## speechiness 2.357820019
## tempo 0.001015292
## valence -0.474750339
predict and evaluate on the test set where cutoff is at 0.5
# Test-set evaluation of the ridge model at the default 0.5 cutoff.
# NOTE(review): `s = ridge1$bestTune` looks like a glmnet-style argument;
# caret's predict.train has no `s` parameter and predicts at the stored
# bestTune regardless — confirm and consider dropping it.
prob.ridge1 <- predict(ridge1, s = ridge1$bestTune, logit.test1, type = "prob")
pred.ridge1 <- ifelse(prob.ridge1[,2] > 0.50, 1, 0) # using cutoff = 0.5
confusionMatrix(as.factor(pred.ridge1), as.factor(logit.test1$popular),
positive = "1")
## Warning in confusionMatrix.default(as.factor(pred.ridge1),
## as.factor(logit.test1$popular), : Levels are not in the same order for reference
## and data. Refactoring data to match.
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 991 110
## 1 0 0
##
## Accuracy : 0.9001
## 95% CI : (0.8808, 0.9172)
## No Information Rate : 0.9001
## P-Value [Acc > NIR] : 0.5254
##
## Kappa : 0
##
## Mcnemar's Test P-Value : <2e-16
##
## Sensitivity : 0.00000
## Specificity : 1.00000
## Pos Pred Value : NaN
## Neg Pred Value : 0.90009
## Prevalence : 0.09991
## Detection Rate : 0.00000
## Detection Prevalence : 0.00000
## Balanced Accuracy : 0.50000
##
## 'Positive' Class : 1
##
Predict and evaluate on the test set with cutoff 0.13 (kappa-based choice; note the validation search above reported an optimal kappa cutoff of 0.15)
# Test-set evaluation of the ridge model at cutoff 0.13 (kappa-based choice).
prob.ridge1 <- predict(ridge1, s = ridge1$bestTune, logit.test1, type = "prob")
pred.ridge1 <- as.numeric(prob.ridge1[,2] > 0.13)  # cutoff = 0.13
confusionMatrix(as.factor(pred.ridge1),
as.factor(logit.test1$popular),
positive = "1")
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 829 78
## 1 162 32
##
## Accuracy : 0.782
## 95% CI : (0.7564, 0.8061)
## No Information Rate : 0.9001
## P-Value [Acc > NIR] : 1
##
## Kappa : 0.0951
##
## Mcnemar's Test P-Value : 8.432e-08
##
## Sensitivity : 0.29091
## Specificity : 0.83653
## Pos Pred Value : 0.16495
## Neg Pred Value : 0.91400
## Prevalence : 0.09991
## Detection Rate : 0.02906
## Detection Prevalence : 0.17620
## Balanced Accuracy : 0.56372
##
## 'Positive' Class : 1
##
Predict and evaluate on the test set with cutoff 0.11 for the sensitivity/specificity balance (note the validation search above reported 0.12)
# Test-set evaluation of the ridge model at the sensitivity/specificity cutoff.
prob.ridge1 <- predict(ridge1, s = ridge1$bestTune, logit.test1, type = "prob")
pred.ridge1 <- ifelse(prob.ridge1[,2] > 0.11, 1, 0) # cutoff = 0.11 (the old comment's "0.13" was a copy-paste error)
confusionMatrix(as.factor(pred.ridge1), as.factor(logit.test1$popular),
positive = "1")
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 533 32
## 1 458 78
##
## Accuracy : 0.555
## 95% CI : (0.525, 0.5846)
## No Information Rate : 0.9001
## P-Value [Acc > NIR] : 1
##
## Kappa : 0.0907
##
## Mcnemar's Test P-Value : <2e-16
##
## Sensitivity : 0.70909
## Specificity : 0.53784
## Pos Pred Value : 0.14552
## Neg Pred Value : 0.94336
## Prevalence : 0.09991
## Detection Rate : 0.07084
## Detection Prevalence : 0.48683
## Balanced Accuracy : 0.62347
##
## 'Positive' Class : 1
##
# LASSO logistic regression (alpha = 1) tuned over the same lambda grid.
lasso1 <- train(popular ~ ., data = logit.train1, method = "glmnet",
family = "binomial", trControl = train_control,
tuneGrid = expand.grid(alpha = 1,lambda = seq(0.001,0.1,by = 0.001)))
# best parameter
lasso1$bestTune
# best coefficient
lasso1.model <- coef(lasso1$finalModel, lasso1$bestTune$lambda)
lasso1.model
Search for best cutoff using validation set
# Validation-set cutoff search for the LASSO model.
lasso1.out <- reg.opt.cut.func(lasso1, logit.valid1)
opt.cut.plot(lasso1.out)
# cut off by kappa
lasso1.out$cutoff[which.max(lasso1.out$kappa.vec)]
# cut off balancing sensitivity and specificity
lasso1.out$cutoff[which.min(lasso1.out$ssdiff.vec)]
# Elastic net: tune alpha and lambda jointly with 10-fold CV.
enet1 <- train(popular ~ ., data = logit.train1, method = "glmnet",
family = "binomial", trControl = train_control,
tuneGrid = expand.grid(alpha = seq(0,1,by = 0.05),lambda = seq(0.001,0.1,by = 0.001)))
# best parameter (alpha = 0 selected here, i.e. the elastic net reduced to ridge)
enet1$bestTune
## alpha lambda
## 100 0 0.1
# best coefficient
enet1.model <- coef(enet1$finalModel, enet1$bestTune$lambda)
enet1.model
## 21 x 1 sparse Matrix of class "dgCMatrix"
## 1
## (Intercept) -0.7794612024
## duration -0.2103504081
## acousticness -0.1043006389
## danceability 0.1845311234
## energy -0.3424994323
## instrumentalness -0.8360208822
## key1 -0.0052137727
## key2 0.0277371298
## key3 0.1656719284
## key4 -0.1672407313
## key5 0.0652609329
## key6 -0.0837961594
## key7 0.0624070766
## key8 0.1531177410
## key9 0.1399246098
## key10 -0.2261685699
## key11 0.0117086597
## loudness 0.0667931811
## speechiness 2.5045118512
## tempo 0.0005094302
## valence -0.5176594535
search for best cutoff with validation set
# Validation-set cutoff search for the elastic net model.
enet1.out <- reg.opt.cut.func(enet1, logit.valid1)
opt.cut.plot(enet1.out)
# cut off by kappa
enet1.out$cutoff[which.max(enet1.out$kappa.vec)]
## [1] 0.15
# cut off balancing sensitivity and specificity
enet1.out$cutoff[which.min(enet1.out$ssdiff.vec)]
## [1] 0.12
create final model on train and validation
# Refit the elastic net on combined train + validation; alpha is re-tuned
# (0.05 selected this time, so mild L1 sparsity appears in the coefficients).
enet1 <- train(popular ~ ., data = all.train1, method = "glmnet",
family = "binomial", trControl = train_control,
tuneGrid = expand.grid(alpha = seq(0,1,by = 0.05),lambda = seq(0.001,0.1,by = 0.001)))
# best parameter
enet1$bestTune
## alpha lambda
## 200 0.05 0.1
# best coefficient (note several key dummies shrunk exactly to zero below)
enet1.model <- coef(enet1$finalModel, enet1$bestTune$lambda)
enet1.model
## 21 x 1 sparse Matrix of class "dgCMatrix"
## 1
## (Intercept) -1.1792935147
## duration -0.1699469465
## acousticness -0.0051000448
## danceability .
## energy -0.0466349208
## instrumentalness -0.4146454458
## key1 .
## key2 .
## key3 0.0528461274
## key4 -0.0243407299
## key5 .
## key6 .
## key7 .
## key8 0.0564175836
## key9 .
## key10 -0.1479976169
## key11 .
## loudness 0.0475991270
## speechiness 2.0833703322
## tempo 0.0001026083
## valence -0.3234255985
predict and evaluate on the test set where cutoff is at 0.5
# Test-set evaluation of the elastic net at the default 0.5 cutoff.
prob.enet1 <- predict(enet1, s = enet1$bestTune, logit.test1, type = "prob")
pred.enet1 <- as.numeric(prob.enet1[,2] > 0.50)  # default cutoff = 0.5
confusionMatrix(as.factor(pred.enet1),
as.factor(logit.test1$popular),
positive = "1")
## Warning in confusionMatrix.default(as.factor(pred.enet1),
## as.factor(logit.test1$popular), : Levels are not in the same order for reference
## and data. Refactoring data to match.
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 991 110
## 1 0 0
##
## Accuracy : 0.9001
## 95% CI : (0.8808, 0.9172)
## No Information Rate : 0.9001
## P-Value [Acc > NIR] : 0.5254
##
## Kappa : 0
##
## Mcnemar's Test P-Value : <2e-16
##
## Sensitivity : 0.00000
## Specificity : 1.00000
## Pos Pred Value : NaN
## Neg Pred Value : 0.90009
## Prevalence : 0.09991
## Detection Rate : 0.00000
## Detection Prevalence : 0.00000
## Balanced Accuracy : 0.50000
##
## 'Positive' Class : 1
##
Predict and evaluate on the test set with cutoff 0.13 (kappa-based choice; note the validation search above reported an optimal kappa cutoff of 0.15)
# Test-set evaluation of the elastic net at cutoff 0.13.
# NOTE(review): the validation search above reported 0.15 as the kappa-optimal
# cutoff, not 0.13 — confirm which value was intended.
prob.enet1 <- predict(enet1, s = enet1$bestTune, logit.test1, type = "prob")
pred.enet1 <- ifelse(prob.enet1[,2] > 0.13, 1, 0) # using cutoff = 0.13
confusionMatrix(as.factor(pred.enet1), as.factor(logit.test1$popular),
positive = "1")
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 870 85
## 1 121 25
##
## Accuracy : 0.8129
## 95% CI : (0.7886, 0.8355)
## No Information Rate : 0.9001
## P-Value [Acc > NIR] : 1.00000
##
## Kappa : 0.0918
##
## Mcnemar's Test P-Value : 0.01475
##
## Sensitivity : 0.22727
## Specificity : 0.87790
## Pos Pred Value : 0.17123
## Neg Pred Value : 0.91099
## Prevalence : 0.09991
## Detection Rate : 0.02271
## Detection Prevalence : 0.13261
## Balanced Accuracy : 0.55259
##
## 'Positive' Class : 1
##
Predict and evaluate on the test set with cutoff 0.11 for the sensitivity/specificity balance (note the validation search above reported 0.12)
# Test-set evaluation of the elastic net at cutoff 0.11 (sensitivity/specificity
# balance; NOTE(review): the validation search reported 0.12 — confirm).
prob.enet1 <- predict(enet1, s = enet1$bestTune, logit.test1, type = "prob")
pred.enet1 <- ifelse(prob.enet1[,2] > 0.11, 1, 0) # cutoff = 0.11 (the old comment's "0.13" was a copy-paste error)
confusionMatrix(as.factor(pred.enet1), as.factor(logit.test1$popular),
positive = "1")
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 511 31
## 1 480 79
##
## Accuracy : 0.5359
## 95% CI : (0.5059, 0.5657)
## No Information Rate : 0.9001
## P-Value [Acc > NIR] : 1
##
## Kappa : 0.0831
##
## Mcnemar's Test P-Value : <2e-16
##
## Sensitivity : 0.71818
## Specificity : 0.51564
## Pos Pred Value : 0.14132
## Neg Pred Value : 0.94280
## Prevalence : 0.09991
## Detection Rate : 0.07175
## Detection Prevalence : 0.50772
## Balanced Accuracy : 0.61691
##
## 'Positive' Class : 1
##